from nltk.corpus import stopwords
from pymystem3 import Mystem
import pandas
import gensim
from gensim.models.phrases import Phraser


#Input CSV to read and output CSV that receives the token table
input_file = "DATA_ukraina_ru_vaccine.csv"
output_file = "TOKENS_ukraina_ru_vaccine.csv"


#Extra stop words appended to NLTK's Russian list further down
added_stopwords = [
    "который",
    "также",
    "свой",
    "год",
    "это",
    "весь",
    "мочь",
    "становиться",
]



#Data import: read the input CSV and pull the "title" and "full_text" columns out as plain lists
df = pandas.read_csv(input_file)

headlines, articles = (df[column].tolist() for column in ("title", "full_text"))

#Lemmatizer and stop-word setup, largely adapted from: https://www.kaggle.com/alxmamaev/how-to-easy-preprocess-russian-text
mystem = Mystem()
#NLTK's Russian stop words, followed by the extra words listed in added_stopwords
russian_stopwords = stopwords.words("russian")
russian_stopwords.extend(added_stopwords)

def pre_process_text(text):
    """Lowercase, lemmatize and filter one document into a list of tokens.

    Args:
        text: Raw document text. A value that is not a string (for example
            the float NaN that a missing CSV cell is read as) is treated as
            an empty document instead of raising AttributeError.

    Returns:
        list: The lemmas returned by ``mystem.lemmatize`` in their original
        order, without stop words and without empty or whitespace-only
        tokens.
    """
    if not isinstance(text, str):
        #Nothing to lemmatize; NaN/None have no .lower()
        return []

    #Only newlines are trimmed from the ends here; other separators are handled by the token filter below
    new_text = text.lower().strip("\n")
    lemmas = mystem.lemmatize(new_text)
    #token.strip() is empty for every whitespace-only token, so this also covers
    #runs such as "\n\n" or "   " that a fixed list of separators would miss
    return [
        token
        for token in lemmas
        if token.strip() and token not in russian_stopwords
    ]


#Lemmatize every article, printing progress as "<position> out of <total>"
processed_articles = []
total_articles = len(articles)
#enumerate supplies the position directly; articles.index(article) rescans the list on
#every iteration and reports the first match's position when a text occurs more than once
for position, article in enumerate(articles, start=1):
    print("Lemmatizing document: " + str(position) + " out of " + str(total_articles))
    processed_articles.append(pre_process_text(article))

#Phrase detection adapted from: https://stackoverflow.com/questions/46129335/get-bigrams-and-trigrams-in-word2vec-gensim
#Phrases needs a list of lists as input: one inner list of tokenized, lemmatized words per text
#The documentation calls the inner lists "sentences", but they don't have to be sentences per se; any list of lists of words works
#Higher min_count and threshold mean fewer phrases
bigram = gensim.models.phrases.Phrases(processed_articles, min_count=5, threshold=10)
bigram_phraser = Phraser(bigram)
bigram_articles = [bigram_phraser[tokens] for tokens in processed_articles]

#Run the bigram output through the same steps again to make trigrams
trigram = gensim.models.phrases.Phrases(bigram_articles, min_count=5, threshold=10)
trigram_phraser = Phraser(trigram)
trigram_articles = [trigram_phraser[tokens] for tokens in bigram_articles]

#Tidy table: one row per token, where "index" is the position of the token's article in trigram_articles
text_tidy = []

#enumerate gives each article its own position; list.index() returns the first *equal* token list,
#which labels a duplicated article's tokens with the earlier article's index and rescans the list per token
for article_number, article in enumerate(trigram_articles):
    text_tidy.extend({"token": token, "index": article_number} for token in article)

df = pandas.DataFrame(text_tidy)
df.to_csv(output_file,  encoding = "utf-8-sig")


